import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import folium
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, auc
#https://www.kaggle.com/anthonypino/melbourne-housing-market?select=Melbourne_housing_FULL.csv
# Load the raw listings and keep only fully populated rows.  Address and
# Postcode are removed first so that gaps in them do not discard a listing;
# dropna() then also removes every row whose target (Price) is missing.
df = pd.read_csv('housing_data_full.csv')
df = df.drop(columns=['Address', 'Postcode']).dropna()

# The Melbourne housing file writes dates day-first (d/m/Y, e.g. 4/02/2016 is
# 4 February 2016).  Without dayfirst=True ambiguous values are parsed
# month-first and end up on the wrong day.
# NOTE(review): confirm the format if a different export of the CSV is used.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['Price'] = df['Price'].astype(int)
df = df.reset_index(drop=True)

display(df.columns)
# Number of distinct values (column 0) and dtype (column 1) of every column.
pd.concat([df.nunique(), df.dtypes], axis=1)
Index(['Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG', 'Date',
'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude', 'Regionname',
'Propertycount'],
dtype='object')
| 0 | 1 | |
|---|---|---|
| Suburb | 315 | object |
| Rooms | 10 | int64 |
| Type | 3 | object |
| Price | 1846 | int32 |
| Method | 5 | object |
| SellerG | 250 | object |
| Date | 77 | datetime64[ns] |
| Distance | 201 | float64 |
| Bedroom2 | 12 | float64 |
| Bathroom | 9 | float64 |
| Car | 11 | float64 |
| Landsize | 1237 | float64 |
| BuildingArea | 465 | float64 |
| YearBuilt | 143 | float64 |
| CouncilArea | 33 | object |
| Lattitude | 5661 | float64 |
| Longtitude | 6070 | float64 |
| Regionname | 8 | object |
| Propertycount | 312 | float64 |
# Distinct region names that remain after cleaning.
pd.unique(df['Regionname'])
array(['Northern Metropolitan', 'Western Metropolitan',
'Southern Metropolitan', 'Eastern Metropolitan',
'South-Eastern Metropolitan', 'Northern Victoria',
'Eastern Victoria', 'Western Victoria'], dtype=object)
# Preview the first five cleaned listings.
df.head(n=5)
| Suburb | Rooms | Type | Price | Method | SellerG | Date | Distance | Bedroom2 | Bathroom | Car | Landsize | BuildingArea | YearBuilt | CouncilArea | Lattitude | Longtitude | Regionname | Propertycount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abbotsford | 2 | h | 1035000 | S | Biggin | 2016-04-02 | 2.5 | 2.0 | 1.0 | 0.0 | 156.0 | 79.0 | 1900.0 | Yarra City Council | -37.8079 | 144.9934 | Northern Metropolitan | 4019.0 |
| 1 | Abbotsford | 3 | h | 1465000 | SP | Biggin | 2017-04-03 | 2.5 | 3.0 | 2.0 | 0.0 | 134.0 | 150.0 | 1900.0 | Yarra City Council | -37.8093 | 144.9944 | Northern Metropolitan | 4019.0 |
| 2 | Abbotsford | 4 | h | 1600000 | VB | Nelson | 2016-04-06 | 2.5 | 3.0 | 1.0 | 2.0 | 120.0 | 142.0 | 2014.0 | Yarra City Council | -37.8072 | 144.9941 | Northern Metropolitan | 4019.0 |
| 3 | Abbotsford | 3 | h | 1876000 | S | Nelson | 2016-07-05 | 2.5 | 4.0 | 2.0 | 0.0 | 245.0 | 210.0 | 1910.0 | Yarra City Council | -37.8024 | 144.9993 | Northern Metropolitan | 4019.0 |
| 4 | Abbotsford | 2 | h | 1636000 | S | Nelson | 2016-08-10 | 2.5 | 2.0 | 1.0 | 2.0 | 256.0 | 107.0 | 1890.0 | Yarra City Council | -37.8060 | 144.9954 | Northern Metropolitan | 4019.0 |
# Correlation of every numeric feature with the sale price, strongest first.
# The matrix is computed once and restricted to numeric columns explicitly,
# because corr() on a frame that still holds text columns raises on pandas >= 2.0.
price_corr = (
    df.select_dtypes(include='number')
    .corr()['Price']
    .sort_values(ascending=False)
)
display(price_corr)
plt.figure(figsize=(16, 8))
price_corr.plot(kind='bar')
Price 1.000000 BuildingArea 0.507278 Rooms 0.475074 Bathroom 0.463501 Bedroom2 0.460880 Longtitude 0.212174 Car 0.209464 Landsize 0.058375 Propertycount -0.059720 Lattitude -0.224255 Distance -0.231212 YearBuilt -0.313664 Name: Price, dtype: float64
<AxesSubplot:>
# Price histogram of two-room properties in the Northern Metropolitan region.
in_northern_metro = df['Regionname'] == 'Northern Metropolitan'
has_two_rooms = df['Rooms'] == 2
northern_two_room_prices = df.loc[in_northern_metro & has_two_rooms, ['Price']]
northern_two_room_prices.sort_values(by='Price', ascending=False).plot(
    kind='hist', bins=20, figsize=(10, 6)
)
<AxesSubplot:ylabel='Frequency'>
# All two-room properties, regardless of region; reused by the cells below.
two_rooms_df = df.loc[df['Rooms'] == 2]

# Histogram of their sale prices.
two_room_prices = two_rooms_df[['Price']].sort_values(by='Price', ascending=False)
two_room_prices.plot(kind='hist', bins=20, figsize=(10, 5))
<AxesSubplot:ylabel='Frequency'>
# Horizontal box plot of two-room sale prices.
two_rooms_df.loc[:, ['Price']].sort_values(by='Price', ascending=False).plot(
    kind='box', vert=False, figsize=(16, 4)
)
<AxesSubplot:>
# Summary statistics of two-room prices: one 'Price' row, statistics as columns.
desc = two_rooms_df[['Price']].describe().T
display(desc)

# A mean above the median is read here as a right-skewed distribution.
mean_price = desc.loc['Price', 'mean']
median_price = desc.loc['Price', '50%']
if mean_price > median_price:
    is_right_skewed = 'Yes'
else:
    is_right_skewed = 'No'
print(f'Is right skewed?: { is_right_skewed }')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Price | 1912.0 | 747038.203452 | 312862.042372 | 250000.0 | 520000.0 | 670000.0 | 890000.0 | 2875000.0 |
Is right skewed?: Yes
# Upper outlier fence for two-room prices: Q3 + 1.5 * IQR.
Q1, Q3 = desc.loc['Price', ['25%', '75%']]
IQR = Q3 - Q1
upper_threshold = Q3 + 1.5 * IQR
upper_threshold
# Split the two-room listings at the upper fence.  The two groups are exact
# complements, so a listing priced exactly at the fence is counted as normal
# instead of being left out of both groups.
is_extreme = two_rooms_df['Price'] > upper_threshold
extreme_prices = two_rooms_df[is_extreme]
normal_prices = two_rooms_df[~is_extreme]
# Map every two-room listing: normal prices in green, extreme prices in red.
# Green circles are added first, then red ones.
m = folium.Map(location=[-37.8, 145], zoom_start=11)
for listings, colour in ((normal_prices, "green"), (extreme_prices, "red")):
    for latitude, longitude in zip(listings.Lattitude, listings.Longtitude):
        folium.Circle(
            radius=10,
            location=[latitude, longitude],
            popup="The Waterfront",
            color=colour,
            fill=False,
        ).add_to(m)
m
# Same map, restricted to listings in the Northern Metropolitan region.
normal_prices_in_a_region = normal_prices[
    normal_prices['Regionname'] == 'Northern Metropolitan'
]
extreme_prices_in_a_region = extreme_prices[
    extreme_prices['Regionname'] == 'Northern Metropolitan'
]

m = folium.Map(location=[-37.8, 145], zoom_start=11)
region_layers = (
    (normal_prices_in_a_region, "green"),
    (extreme_prices_in_a_region, "red"),
)
for listings, colour in region_layers:
    for latitude, longitude in zip(listings.Lattitude, listings.Longtitude):
        folium.Circle(
            radius=10,
            location=[latitude, longitude],
            popup="The Waterfront",
            color=colour,
            fill=False,
        ).add_to(m)
m
# Same map, restricted to the suburb of Abbotsford and zoomed in further.
normal_prices_in_a_Suburb = normal_prices[normal_prices['Suburb'] == 'Abbotsford']
extreme_prices_in_a_Suburb = extreme_prices[extreme_prices['Suburb'] == 'Abbotsford']

m = folium.Map(location=[-37.8, 145], zoom_start=14)
suburb_layers = (
    (normal_prices_in_a_Suburb, "green"),
    (extreme_prices_in_a_Suburb, "red"),
)
for listings, colour in suburb_layers:
    for latitude, longitude in zip(listings.Lattitude, listings.Longtitude):
        folium.Circle(
            radius=10,
            location=[latitude, longitude],
            popup="The Waterfront",
            color=colour,
            fill=False,
        ).add_to(m)
m
The number of rooms is positively correlated with the price, so extreme prices are identified separately for each room count.
# Label every listing: 0 - normal price, 1 - extreme price.
# A price is extreme when it lies above Q3 + 1.5 * IQR of the prices of
# listings with the same room count.  The fences are computed in one grouped
# pass, which also leaves the notebook-level normal_prices, extreme_prices,
# upper_threshold and desc from the two-room analysis untouched.
price_by_rooms = df.groupby('Rooms')['Price']
q1_by_rooms = price_by_rooms.quantile(0.25)
q3_by_rooms = price_by_rooms.quantile(0.75)
upper_fence_by_rooms = q3_by_rooms + 1.5 * (q3_by_rooms - q1_by_rooms)

# Look up each listing's fence through its own room count.
listing_fence = df['Rooms'].map(upper_fence_by_rooms)
df['PriceReasoning'] = (df['Price'] > listing_fence).astype('int64')

total_e_prices = df[['PriceReasoning']].sum()
print(f'Total extreme priced houses : {total_e_prices.PriceReasoning}')
Total extreme priced houses : 291
# Correlation of every numeric feature with the extreme-price label, strongest
# first.  Computed once and restricted to numeric columns explicitly, because
# corr() on a frame that still holds text columns raises on pandas >= 2.0.
price_reasoning_corr = (
    df.select_dtypes(include='number')
    .corr()['PriceReasoning']
    .sort_values(ascending=False)
)
display(price_reasoning_corr)
plt.figure(figsize=(16, 8))
price_reasoning_corr.plot(kind='bar')
PriceReasoning 1.000000 Price 0.471453 BuildingArea 0.123791 Bathroom 0.096285 Car 0.035364 Longtitude 0.035235 Landsize 0.008202 Bedroom2 0.004085 Propertycount -0.003089 Rooms -0.009698 Lattitude -0.075807 Distance -0.125653 YearBuilt -0.153093 Name: PriceReasoning, dtype: float64
<AxesSubplot:>
# Per-column cardinality (column 0) and dtype (column 1), ordered by dtype.
info = pd.concat([df.nunique(), df.dtypes], axis='columns')
info.sort_values(by=1)
| 0 | 1 | |
|---|---|---|
| Price | 1846 | int32 |
| PriceReasoning | 2 | int64 |
| Rooms | 10 | int64 |
| Longtitude | 6070 | float64 |
| Lattitude | 5661 | float64 |
| YearBuilt | 143 | float64 |
| BuildingArea | 465 | float64 |
| Landsize | 1237 | float64 |
| Car | 11 | float64 |
| Bathroom | 9 | float64 |
| Bedroom2 | 12 | float64 |
| Distance | 201 | float64 |
| Date | 77 | datetime64[ns] |
| Propertycount | 312 | float64 |
| SellerG | 250 | object |
| CouncilArea | 33 | object |
| Method | 5 | object |
| Type | 3 | object |
| Regionname | 8 | object |
| Suburb | 315 | object |
# SellerG, Date and Suburb have too many distinct values; one-hot encoding them
# would blow up the number of columns, so they are removed first.
df = df.drop(columns=['SellerG', 'Date', 'Suburb'])

# One-hot encode the remaining categorical features; the label is set aside
# during encoding and attached again afterwards.
x = df.loc[:, df.columns != 'PriceReasoning']
ohe_df = pd.get_dummies(x)
ohe_df['PriceReasoning'] = df['PriceReasoning']

# Correlation of every encoded feature with the label, strongest first.
label_corr = ohe_df.corr()['PriceReasoning'].sort_values(ascending=False)
plt.figure(figsize=(100, 8))
display(label_corr)
label_corr.plot(kind='bar')
PriceReasoning 1.000000
Price 0.471453
Regionname_Southern Metropolitan 0.191453
CouncilArea_Stonnington City Council 0.176749
BuildingArea 0.123791
...
Type_u -0.059180
Lattitude -0.075807
Regionname_Western Metropolitan -0.092045
Distance -0.125653
YearBuilt -0.153093
Name: PriceReasoning, Length: 62, dtype: float64
<AxesSubplot:>
# Feature matrix and label.
# NOTE(review): Price is still one of the features although PriceReasoning is
# derived directly from Price per room count, so the classifiers can largely
# reconstruct the label from it - consider dropping it before trusting scores.
X = ohe_df.drop(columns=['PriceReasoning'])
Y = ohe_df['PriceReasoning']
features = X.columns.values

# Balance the classes by randomly discarding majority-class rows.
# (TomekLinks(n_jobs=3) was tried as an alternative sampler.)
rand_under_sample = RandomUnderSampler(random_state=1)
X_u, Y_u = rand_under_sample.fit_resample(X, Y)

X_train, X_test, y_train, y_test = train_test_split(
    X_u, Y_u, test_size=0.3, random_state=1
)

# Standardise after the split and fit the scaler on the training rows only, so
# that statistics of the test rows never leak into the training data.
# (MinMaxScaler(feature_range=(0, 1)) was tried as an alternative scaler.)
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=features, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=features, index=X_test.index
)
# Project the training rows onto two dimensions with PCA and with t-SNE and
# plot both embeddings side by side, coloured by the extreme-price label.
X_pca = PCA(n_components=2, random_state=1).fit_transform(X_train)
X_tsne = TSNE(n_components=2, random_state=1).fit_transform(X_train)

red_patch = mpatches.Patch(color='#AF0000', label='Yes')
blue_patch = mpatches.Patch(color='#0A0AFF', label='No')

# One colour per point, taken from the legend patches.  Every point is drawn
# exactly once; previously each axis was drawn twice with the second pass
# hiding the first, in colormap colours that did not match the legend.
point_colours = np.where(y_train == 1, '#AF0000', '#0A0AFF')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(26, 8))
fig.suptitle('2 Clusters from Dimensionality Reduction', fontsize=18)
for axis, title, embedding in ((ax1, 'PCA', X_pca), (ax2, 't-SNE', X_tsne)):
    axis.set_title(title, fontsize=16)
    axis.scatter(embedding[:, 0], embedding[:, 1], s=4, c=point_colours, linewidths=1)
    axis.legend(handles=[blue_patch, red_patch])
<matplotlib.legend.Legend at 0x13362e69880>
# Baseline classifier: logistic regression with default settings.
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction_test = result.predict(X_test)
print(accuracy_score(y_test, prediction_test))

# Bar chart of the 20 largest coefficients, labelled by feature name.
weights = pd.Series(model.coef_[0], index=X.columns.values)
top_weights = weights.sort_values(ascending=False).head(20)
print(top_weights.plot(kind='bar', figsize=(20, 4)))
0.96 AxesSubplot(0.125,0.125;0.775x0.755)
# Logistic regression again, this time reporting the full set of test metrics.
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction_test = result.predict(X_test)
print(accuracy_score(y_test, prediction_test))
coefficients = pd.Series(model.coef_[0], index=X.columns.values)

# Each metric is evaluated on the test split and rounded to two decimals.
precision, recall, fscore, accuracy = (
    round(metric(y_test, prediction_test), 2)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)
print(f'Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nF score: {fscore}')

# Bar chart of the 20 largest coefficients.
coefficients.sort_values(ascending=False).head(20).plot(kind='bar', figsize=(15, 3))
0.96 Accuracy: 0.96 Precision: 0.97 Recall: 0.96 F score: 0.96
<AxesSubplot:>
# Random forest: 100 trees, at most 100 leaves each, fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=1, max_leaf_nodes=100)
model.fit(X_train, y_train)
prediction_test = model.predict(X_test)

# Each metric is evaluated on the test split and rounded to two decimals.
precision, recall, fscore, accuracy = (
    round(metric(y_test, prediction_test), 2)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)
print(f'Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nF score: {fscore}')

# Horizontal bar chart of the 20 most important features (largest at the top).
imp_features = pd.Series(model.feature_importances_, index=X_train.columns.values)
imp_features.sort_values().tail(20).plot(kind='barh', figsize=(5, 10))
Accuracy: 0.94 Precision: 0.93 Recall: 0.96 F score: 0.94
<AxesSubplot:>
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Feed-forward binary classifier: five ReLU layers of shrinking width, each
# followed by 20 % dropout, and a single sigmoid output unit.
# The input width follows the training matrix instead of a hard-coded 61, so
# the network keeps working when the set of feature columns changes.
n_features = X_train.shape[1]
ann_model = Sequential()
ann_model.add(Dense(61, activation='relu', input_dim=n_features))
ann_model.add(Dropout(0.2))
for units in (30, 15, 7, 3):
    ann_model.add(Dense(units, activation='relu'))
    ann_model.add(Dropout(0.2))
ann_model.add(Dense(1, activation='sigmoid'))
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 20 % of the training rows are held out for validation during fitting.
model_history = ann_model.fit(
    X_train, y_train, validation_split=0.20, batch_size=1000, epochs=200, verbose=False
)

# evaluate() yields the loss followed by the compiled metric (accuracy);
# the loss is what is printed as "Score".
accuracy = ann_model.evaluate(X_train, y_train, verbose=False)
print("Training Score: {:.2f}".format(accuracy[0]))
print("Training Accuracy: {:.2f}".format(accuracy[1]))
accuracy = ann_model.evaluate(X_test, y_test, verbose=False)
print("Testing Score: {:.2f}".format(accuracy[0]))
print("Testing Accuracy: {:.2f}".format(accuracy[1]))
Training Score: 0.06 Training Accuracy: 0.99 Testing Score: 0.30 Testing Accuracy: 0.94
# Fit several classifiers on the same split and tabulate their test metrics.
models = [
    ('XGBClassifier', XGBClassifier(use_label_encoder=False)),
    ('Logestic Regression', LogisticRegression()),
    ('Support Vector Classification', SVC(kernel='linear')),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('RandomForestClassifier',
     RandomForestClassifier(n_estimators=100, random_state=1, max_leaf_nodes=100)),
]

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
performance_rows = []
for model_name, model in models:
    model.fit(X_train, y_train)
    prediction_test = model.predict(X_test)
    # Scale to percent before rounding so no floating-point noise is
    # reintroduced by multiplying an already rounded value.
    performance_rows.append(
        {
            'Model': model_name,
            'Accuracy': round(accuracy_score(y_test, prediction_test) * 100, 2),
            'F-Score': round(f1_score(y_test, prediction_test) * 100, 2),
            'Precision': round(precision_score(y_test, prediction_test) * 100, 2),
            'Recall': round(recall_score(y_test, prediction_test) * 100, 2),
        }
    )

model_performances_df = pd.DataFrame(
    performance_rows,
    columns=['Model', 'Accuracy', 'F-Score', 'Precision', 'Recall'],
)
model_performances_df
[22:13:53] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
| Model | Accuracy | F-Score | Precision | Recall | |
|---|---|---|---|---|---|
| 0 | XGBClassifier | 98.86 | 98.91 | 98.91 | 98.91 |
| 1 | Logestic Regression | 96.00 | 96.17 | 96.70 | 95.65 |
| 2 | Support Vector Classification | 98.29 | 98.38 | 97.85 | 98.91 |
| 3 | AdaBoostClassifier | 97.14 | 97.24 | 98.88 | 95.65 |
| 4 | RandomForestClassifier | 93.71 | 94.12 | 92.63 | 95.65 |
# ROC curve of the XGBoost model, scored with the predicted probability of
# class 1 on the test split.
model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)
predictions = model.predict_proba(X_test)
FPR, TPR, thresholds = roc_curve(y_test, predictions[:, 1])

tenths = [i / 10 for i in range(11)]
plt.figure(figsize=(8, 6))
# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.plot(FPR, TPR, color='red')
plt.xlim([-0.02, 1.02])
plt.ylim([-0.02, 1.02])
plt.xticks(tenths)
plt.yticks(tenths)
plt.xlabel('FPR-False Positive Rate')
plt.ylabel('TPR-True Positive Rate')
plt.title('ROC - Receiver Operating Characteristic curve for XGBoost model')
print(f'AUC Area Under Curve: {auc(FPR, TPR)}')
[22:53:54] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. AUC Area Under Curve: 0.9971189104243059
# ROC curve of the linear SVC, scored with its decision-function values on
# the test split.
model = SVC(kernel='linear')
model.fit(X_train, y_train)
predictions = model.decision_function(X_test)
FPR, TPR, thresholds = roc_curve(y_test, predictions)

tenths = [i / 10 for i in range(11)]
plt.figure(figsize=(8, 6))
# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.plot(FPR, TPR, color='red')
plt.xlim([-0.02, 1.02])
plt.ylim([-0.02, 1.02])
plt.xticks(tenths)
plt.yticks(tenths)
plt.xlabel('FPR-False Positive Rate')
plt.ylabel('TPR-True Positive Rate')
plt.title('ROC - Receiver Operating Characteristic curve for SVC model')
print(f'AUC Area Under Curve: {auc(FPR, TPR)}')
AUC Area Under Curve: 0.9876898899947616
# Hyper-parameter search for the SVC, ranked by cross-validated ROC AUC.
# gamma only affects the rbf and sigmoid kernels, so the linear kernel gets
# its own sub-grid without it; a single flat grid would refit every linear
# candidate once per gamma value with identical results.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]
params = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf', 'sigmoid'], 'C': c_values, 'gamma': gamma_values},
]
# refit=True retrains the best candidate on the whole training split.
grid = GridSearchCV(SVC(), params, refit=True, verbose=1, scoring='roc_auc')
grid_result = grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_estimator_)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
SVC(C=10, gamma=0.001, kernel='linear')
# Per-class precision, recall and F1 of the refitted best estimator on the
# test split.
grid_predictions = grid.predict(X_test)
report = classification_report(y_true=y_test, y_pred=grid_predictions)
print(report)
precision recall f1-score support
0 0.93 0.99 0.96 83
1 0.99 0.93 0.96 92
accuracy 0.96 175
macro avg 0.96 0.96 0.96 175
weighted avg 0.96 0.96 0.96 175